Sampling

Prof. Dr. Jörg Schoder

2023-05-29

Urnenmodell

Quelle: moderndive.com

Zufallsexperiment und Stichprobenziehung

Stichprobenziehung

Ergebnis einer Stichprobe

Ergebnis mehrerer Stichproben

Datensatz zum physischen Experiment

library(moderndive)
tactile_prop_red
## # A tibble: 33 × 4
##    group            replicate red_balls prop_red
##    <chr>                <int>     <int>    <dbl>
##  1 Ilyas, Yohan             1        21     0.42
##  2 Morgan, Terrance         2        17     0.34
##  3 Martin, Thomas           3        21     0.42
##  4 Clark, Frank             4        21     0.42
##  5 Riddhi, Karina           5        18     0.36
##  6 Andrew, Tyler            6        19     0.38
##  7 Julia                    7        19     0.38
##  8 Rachel, Lauren           8        11     0.22
##  9 Daniel, Caroline         9        15     0.3 
## 10 Josh, Maeve             10        17     0.34
## # ℹ 23 more rows

Stichprobenverteilung

library(tidyverse)
ggplot(tactile_prop_red, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, boundary = 0.4,
                 color = "white") +
  scale_y_continuous(limits = c(0, 10), breaks = c(0:10)) +
  labs(x = "Anteil roter Kugeln aus insgesamt 50 (roten und weißen) Kugeln",
       y="Anzahl",
       title = "Verteilung von 33 Anteilswerten roter Kugeln")

Wahre Verteilung

Daten

bowl
## # A tibble: 2,400 × 2
##    ball_ID color
##      <int> <chr>
##  1       1 white
##  2       2 white
##  3       3 white
##  4       4 red  
##  5       5 white
##  6       6 white
##  7       7 red  
##  8       8 white
##  9       9 red  
## 10      10 white
## # ℹ 2,390 more rows

Anzahl und Anteil roter Kugeln

red_true <- bowl %>%
               summarize(Anzahl_rot = sum(color == "red"),
                     Anteil_rot = sum(color == "red")/length(color)
            )
red_true  %>%
    mutate(Anteil_rot=paste0(Anteil_rot*100,"%"))
## # A tibble: 1 × 2
##   Anzahl_rot Anteil_rot
##        <int> <chr>     
## 1        900 37.5%

(Virtuelles) Sampling und Punktschätzung

Unterschiedliche Schaufelgrößen

“Kleine Schaufel” (Stichprobengröße n = 25)

n <- 25
rep <- 1000
virtual_samples_25 <- bowl %>%
                        rep_sample_n(size = n,
                                     reps = rep)
virtual_samples_25
## # A tibble: 25,000 × 3
## # Groups:   replicate [1,000]
##    replicate ball_ID color
##        <int>   <int> <chr>
##  1         1    2017 white
##  2         1     837 red  
##  3         1     619 red  
##  4         1    2037 white
##  5         1     576 red  
##  6         1    1451 red  
##  7         1     860 white
##  8         1     847 white
##  9         1     294 white
## 10         1    2061 white
## # ℹ 24,990 more rows
virtual_prop_red_25 <- virtual_samples_25 %>%
                              group_by(replicate) %>%
                              summarize(red = sum(color == "red")) %>%
                              mutate(prop_red = red / n)
virtual_prop_red_25
## # A tibble: 1,000 × 3
##    replicate   red prop_red
##        <int> <int>    <dbl>
##  1         1    10     0.4 
##  2         2     9     0.36
##  3         3     9     0.36
##  4         4     6     0.24
##  5         5     4     0.16
##  6         6     8     0.32
##  7         7     7     0.28
##  8         8     8     0.32
##  9         9     9     0.36
## 10        10     7     0.28
## # ℹ 990 more rows
virtual_prop_red_25 %>% 
  ggplot(aes(x = prop_red)) +
      geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
      labs(x = paste0("Anteil roter Kugeln (aus ",n,")"),
           title = paste0("Kleine Schaufel (n=",n,")")) +
      geom_vline(xintercept = red_true$Anteil_rot,color='red')

“Mittlere Schaufel” (Stichprobengröße n = 50)

n <- 50
virtual_samples_50 <- bowl %>%
                         rep_sample_n(size = n,
                                      reps = rep)
virtual_prop_red_50 <- virtual_samples_50 %>%
                              group_by(replicate) %>%
                              summarize(red = sum(color == "red")) %>%
                              mutate(prop_red = red / n)
virtual_prop_red_50
## # A tibble: 1,000 × 3
##    replicate   red prop_red
##        <int> <int>    <dbl>
##  1         1    20     0.4 
##  2         2    21     0.42
##  3         3    16     0.32
##  4         4    19     0.38
##  5         5    16     0.32
##  6         6    23     0.46
##  7         7    19     0.38
##  8         8    19     0.38
##  9         9    21     0.42
## 10        10    11     0.22
## # ℹ 990 more rows
virtual_prop_red_50 %>% 
  ggplot(aes(x = prop_red)) +
      geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
      labs(x = paste0("Anteil roter Kugeln (aus ",n,")"),
           title = paste0("Kleine Schaufel (n=",n,")")) +
      geom_vline(xintercept = red_true$Anteil_rot,color='red')

“Große Schaufel” (Stichprobengröße n = 100)

n<-100
virtual_samples_100 <- bowl %>%
                          rep_sample_n(size = n,
                                       reps = rep)
virtual_samples_100
## # A tibble: 100,000 × 3
## # Groups:   replicate [1,000]
##    replicate ball_ID color
##        <int>   <int> <chr>
##  1         1    1852 red  
##  2         1     108 white
##  3         1     214 white
##  4         1    1378 red  
##  5         1     451 red  
##  6         1    1681 red  
##  7         1    1205 red  
##  8         1    1420 red  
##  9         1    1428 red  
## 10         1     131 white
## # ℹ 99,990 more rows
virtual_prop_red_100 <- virtual_samples_100 %>%
                              group_by(replicate) %>%
                              summarize(red = sum(color == "red")) %>%
                              mutate(prop_red = red / n)
virtual_prop_red_100
## # A tibble: 1,000 × 3
##    replicate   red prop_red
##        <int> <int>    <dbl>
##  1         1    40     0.4 
##  2         2    30     0.3 
##  3         3    32     0.32
##  4         4    39     0.39
##  5         5    40     0.4 
##  6         6    42     0.42
##  7         7    39     0.39
##  8         8    38     0.38
##  9         9    38     0.38
## 10        10    31     0.31
## # ℹ 990 more rows
virtual_prop_red_100 %>% 
  ggplot(aes(x = prop_red)) +
      geom_histogram(binwidth = 0.05, boundary = 0.4, color = "white") +
      labs(x = paste0("Anteil roter Kugeln (aus ",n,")"),
           title = paste0("Kleine Schaufel (n=",n,")")) +
      geom_vline(xintercept = red_true$Anteil_rot,color='red')

Grundproblem der induktiven Statistik

\(\Rightarrow\) Bedeutung der Zufallsaufwahl bei der Datenerhebung (!)

Wenn der wahre Wert unbekannt ist

(Virtuelles) Ziehen einer Stichprobe

n <- 50
virtual_shovel <- bowl %>% 
                     rep_sample_n(size = n)
virtual_shovel
## # A tibble: 50 × 3
## # Groups:   replicate [1]
##    replicate ball_ID color
##        <int>   <int> <chr>
##  1         1    1710 white
##  2         1    2162 red  
##  3         1    2205 red  
##  4         1    1431 white
##  5         1    1254 white
##  6         1    1057 white
##  7         1    2054 white
##  8         1       4 red  
##  9         1     980 white
## 10         1     454 red  
## # ℹ 40 more rows

Anzahl und Anteil der roten Kugeln

virtual_shovel %>% 
  summarize(num_red = sum(color == "red")) %>% 
  mutate(prop_red = num_red /n)
## # A tibble: 1 × 3
##   replicate num_red prop_red
##       <int>   <int>    <dbl>
## 1         1      18     0.36

Statistische Inferenz

Stichprobenverteilung

Reliabilität und Validität

Stichprobenverteilungen…

…und wahrer Wert (rote Linien)

Vergleich der Standardfehler

virtual_prop_red_25 %>%
  summarize(sd = sd(prop_red))
## # A tibble: 1 × 1
##       sd
##    <dbl>
## 1 0.0949
virtual_prop_red_50 %>%
  summarize(sd = sd(prop_red))
## # A tibble: 1 × 1
##       sd
##    <dbl>
## 1 0.0674
virtual_prop_red_100 %>%
  summarize(sd = sd(prop_red))
## # A tibble: 1 × 1
##       sd
##    <dbl>
## 1 0.0488

Intervallschätzung und Konfidenzintervall

Punktschätzung vs. Intervallschätzung

Ermittlung von Konfidenzintervallen

Einzelne Stichprobe aus dem physischen Experiment

bowl_sample_1
## # A tibble: 50 × 1
##    color
##    <chr>
##  1 white
##  2 white
##  3 red  
##  4 red  
##  5 white
##  6 white
##  7 red  
##  8 white
##  9 white
## 10 white
## # ℹ 40 more rows
stats_sample_1 <- bowl_sample_1 %>%
                  summarize(Anzahl_rot=sum(color=='red'),
                            Anteil_rot=sum(color=='red')/
                                                length(color))
stats_sample_1
## # A tibble: 1 × 2
##   Anzahl_rot Anteil_rot
##        <int>      <dbl>
## 1         21       0.42

In der Stichprobe von Ilyas und Yohan befinden sich insgesamt 21 rote Kugeln, d.h. der Anteil roter Kugeln entspricht in ihrer Stichprobe 42%.

Nutzung der Funktionen im infer-Paket

Schritt 1: specify()

library(infer)
#bowl_sample_1 %>%        
#    specify(response = color)   # funktioniert nicht - "success" (also das "Ereignis A") muss definiert werden!

bowl_sample_1 %>%
    specify(response = color, success = "red")
## Response: color (factor)
## # A tibble: 50 × 1
##    color
##    <fct>
##  1 white
##  2 white
##  3 red  
##  4 red  
##  5 white
##  6 white
##  7 red  
##  8 white
##  9 white
## 10 white
## # ℹ 40 more rows

Schritt 2: generate()

bowl_sample_1 %>%
  specify(response = color, success = "red") %>%
  generate(reps = 1000, type = "bootstrap")
## Response: color (factor)
## # A tibble: 50,000 × 2
## # Groups:   replicate [1,000]
##    replicate color
##        <int> <fct>
##  1         1 white
##  2         1 red  
##  3         1 red  
##  4         1 red  
##  5         1 white
##  6         1 white
##  7         1 red  
##  8         1 red  
##  9         1 red  
## 10         1 red  
## # ℹ 49,990 more rows

Schritt 3: calculate()

sample_1_bootstrap <- bowl_sample_1 %>%
                           specify(response = color,
                                   success = "red") %>%
                           generate(reps = 1000,
                                    type = "bootstrap") %>%
                           calculate(stat = "prop")
sample_1_bootstrap
## Response: color (factor)
## # A tibble: 1,000 × 2
##    replicate  stat
##        <int> <dbl>
##  1         1  0.46
##  2         2  0.36
##  3         3  0.52
##  4         4  0.42
##  5         5  0.36
##  6         6  0.44
##  7         7  0.52
##  8         8  0.42
##  9         9  0.4 
## 10        10  0.36
## # ℹ 990 more rows

Schritt 4: visualize()

## # A tibble: 1 × 2
##   lower_ci upper_ci
##      <dbl>    <dbl>
## 1     0.28     0.56
sample_1_bootstrap %>%
        visualize(bins = 15) +
        shade_confidence_interval(endpoints = percentile_ci_1) +
        geom_vline(xintercept = 0.42, linetype = "dashed")

Interpretation Konfidenzintervall

Perzentil-Methode

Standardfehler-Methode

Stichprobenverteilung vs. Bootstrap-Verteilung

Stichprobenverteilung

## # A tibble: 1 × 1
##       se
##    <dbl>
## 1 0.0665

Bootstrapping-Verteilung

## # A tibble: 1 × 1
##       se
##    <dbl>
## 1 0.0693